# Load packages --------------------------------------------------------------
# library() errors immediately when a package is missing; require() only
# returns FALSE and would let the script fail much later with a confusing
# "could not find function" error.
library(RPostgreSQL)
library(getPass)
library(tidyverse)
library(sf)
library(lubridate)
library(zoo) # to extend the apply family of functions (rolling window)
# If data preprocessing is needed, do it here.
# functions:
# Flag values for quality check 1.
#
# x: numeric vector of check statistics (here: absolute temperature deltas).
# Returns a numeric vector the same length as x: 0 where the check passed
# (x < 1), 1 where it failed (x >= 1). NA inputs stay NA.
flagR <- function(x) {
  # Coercing the logical comparison is the idiomatic, vectorized equivalent
  # of ifelse(x >= 1, 1, 0) and preserves NA propagation.
  as.numeric(x >= 1)
}
# Flag a window of values for quality check 3 (persistence / variability).
#
# x: numeric vector holding one rolling window's worth of values.
# Returns 1 when the window shows no variability (exactly one unique value),
# 0 otherwise. Must be applied over a moving window, e.g. zoo::rollapply().
flagP <- function(x) {
  # The condition is a scalar, so ifelse() is the wrong tool; coercing the
  # scalar logical gives the same 0/1 result more directly.
  as.numeric(length(unique(x)) == 1)
}
Hobo picture and description
An image of the HOBO logger and its mounting situation was included here.
Read in from GitHub
# Load data from Github (then eval = TRUE)
# Raw 10-minute HOBO record; skip = 1 drops the plot-title header line.
my_HOBO <- read.csv("https://raw.githubusercontent.com/data-hydenv/data/master/hobo/2023/raw/10350049.csv", skip = 1)
# Load calibration file from GitHub
# Table holding the calibration timestamp and the reference temperature.
t_cal <- read.csv("https://raw.githubusercontent.com/data-hydenv/data/master/hobo/2023/calibration.csv")
Date/Time column must be parsed properly:
# Parse the German "Datum.Zeit" column (month/day/year h:m:s) to POSIXct.
my_HOBO$Datum.Zeit..GMT.01.00 <- mdy_hms(my_HOBO$Datum.Zeit..GMT.01.00)
Tidy up and rename the columns
# Keep only the first four columns and give them short, usable names.
my_HOBO <- as_tibble(my_HOBO) %>%
select(., 1:4) %>%
rename(id = 1, dttm = 2, temp = 3, lux = 4)
head(my_HOBO)
The calibration data from GitHub is a table of the time when the calibration measurement was performed and the target value temperature.
# Calibration: compare the HOBO reading at the calibration time with the
# reference value and subtract the resulting additive offset.
caltime <- t_cal$This.is.the.calibration.value[3] # time of calibration
calib_line <- which(my_HOBO$dttm == caltime) # matching row in raw data
meas_temp <- num(my_HOBO[calib_line,]$temp, digits = 3) # what the HOBO measured
meas_temp <- meas_temp[1] # drop pillar_num display attributes
tru_temp <- num(as.numeric(t_cal$to.calibrate.your.Hobo.measurements.in..C.[3]),
digits = 3) # reference (target) temperature
tru_temp <- tru_temp[1] # target value without display attributes
cal_offset <- meas_temp-tru_temp # measured minus reference
cal_offset <- as.numeric(cal_offset[1])
# so -0.267 is now my calibration offset
my_HOBO$temp <- my_HOBO$temp-cal_offset # subtract offset from raw temperature
Truncate the data to the given time frame. First declare the target start and end times using `lubridate::ymd_hms()`.
# Study period boundaries as lubridate POSIXct values (UTC by default).
start_time <- ymd_hms("2022-12-01 00:00:00") # start point
end_time <- ymd_hms("2023-01-07 23:50:00") # end point
time_range <- interval(start_time, end_time) # interval object; may be useful later
Filter the raw data to clip it according to the start and end times
and also overwrite the id column with a new value starting from
1. format() was used to format decimals in the
light intensity and temperature vectors and to change the timestamp
format.
# Clip to the study period, renumber id from 1, and format the columns for
# export (note: format() deliberately converts the columns to character).
my_HOBO <- my_HOBO %>%
  filter(between(dttm, start_time, end_time)) %>% # keep study period only
  mutate(id = seq_len(n())) # renumber after clipping; avoids 1:length() pitfall
my_HOBO$temp <- format(my_HOBO$temp, digits = 3, nsmall = 3) # 3 decimal places
my_HOBO$lux <- format(my_HOBO$lux, digits = 3, nsmall = 3) # 3 decimal places
my_HOBO$dttm <- format(my_HOBO$dttm, "%Y-%m-%d %H:%M") # drop the seconds
The tibble: my_HOBO was then written to disk and
uploaded to GitHub.
# Write the clipped, calibrated 10-minute series to disk (for GitHub upload).
write_csv(my_HOBO, file = "10350049.csv")
Read the data back in from GitHub…
# Load data from Github (then eval = TRUE)
# Re-read the cleaned 10-minute series as the starting point for QC.
HOBO_qc <- read.csv("https://raw.githubusercontent.com/data-hydenv/data/master/hobo/2023/10_minutes/10350049.csv")
# BUG FIX: head(data) printed the body of the base function utils::data()
# (visible in the original transcript); the freshly loaded frame was meant.
head(HOBO_qc)
1 function (..., list = character(), package = NULL, lib.loc = NULL,
2 verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE)
3 {
4 fileExt <- function(x) {
5 db <- grepl("\\\\.[^.]+\\\\.(gz|bz2|xz)$", x)
6 ans <- sub(".*\\\\.", "", x)
…and check both value ranges.
# QCP1: plausibility check on the measurement ranges of both variables.
# TODO NAs
range(HOBO_qc$temp)
[1] -8.714 21.397
range(HOBO_qc$lux)
[1] 0.0 2755.6
Question: How many data points are outside the measurement range?
Answer: None
# QCP2 (spike check): lag the temperature by one step and store the
# 10-minute difference; large |delta| values indicate spikes.
# lag one column, then mutate difference to new column
HOBO_wip <- HOBO_qc %>%
mutate(., lagged=lag(temp)) %>%
mutate(., delta=temp-lagged)
range(na.omit(HOBO_wip$delta)) # delta values out of legal range?
[1] -1.528 2.906
Question: Describe shortly how many data points failed during this QCP and discuss whether there is a certain daytime pattern of failure or not?
Answer:
# QCP2: flag every step whose delta failed the check. flagR() is already
# vectorized, so calling it directly replaces the needless element-wise
# sapply() loop with an identical result.
qc_R <- flagR(HOBO_wip$delta) # apply flagging function
HOBO_wip <- cbind(HOBO_wip, qc_R) # collect results in new data frame
length(which(qc_R == 1)) # count occurrences of bad data points
[1] 24
# TODO day/night pattern?
Use zoo::rollapply() to run the persistence check over a
moving time window of 6 time steps (i.e. 60 minutes).
# QCP3 (persistence): flag 6-step (60 min) windows without variability.
qc2 <- rollapply(HOBO_wip$delta, width=6,FUN=flagP) # width parameter is window
Task: Code in this section should analyse the persistence.
# TODO
Compile results
# Compile persistence flags: rollapply() yields one value per complete
# 6-step window, i.e. length(delta) - 5 values, so the first 5 positions
# have no flag and are padded with NA. Building the vector in one go
# replaces the original allocate-then-overwrite pattern.
qc_P <- c(rep(NA, 5), qc2) # pad the 5 undefined leading positions
HOBO_wip <- cbind(HOBO_wip, qc_P) # collect in the working data frame
The HOBO was situated in a planter at all times so there was no disturbance of the light measurement.
# QCP4 preparation: inspect the distribution of the light measurements.
hist(HOBO_wip$lux, breaks = 25)
summary(HOBO_wip$lux) # quite the spread
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 0.0 0.0 114.0 86.1 2755.6
sd(HOBO_wip$lux)
[1] 283.974
# Zoom into the mid-range light values (1 < lux <= 500).
toplux <- filter(HOBO_wip,lux<=500 & lux>1)
hist(toplux$lux,breaks=50)
Assign sky illumination classes (SICs) from the reference classification table to the light measurement data points.
# Assign a sky illumination class (SIC) to each light reading. case_when()
# evaluates top-down, so each branch covers the next higher lux interval.
HOBO_wip <- HOBO_wip %>%
mutate(SIC = case_when(lux <= 10 ~ 'Night_0',
lux <= 500 ~ 'Rise_Set_1',
lux <= 2000 ~ 'Overcast_full_2',
lux <= 15000 ~ 'Overcast_light_3',
lux <= 20000 ~ 'Clear_4',
lux < 50000 ~ 'Sunshine_5',
lux >= 50000 ~ 'Brightshine_6',
))
unique(HOBO_wip$SIC) # only the first four SICs present
[1] "Night_0" "Rise_Set_1" "Overcast_full_2"
[4] "Overcast_light_3"
Task: Discuss shortly how often and when during daytime the QCP4 flags bad data. Elaborate on some reasons for your results.
Answer:
# Analyse occurrence of flags and aggregate the QC fails.
HOBO_wip <- HOBO_wip %>%
mutate(qc_tot = qc_P + qc_R)
HOBO_wip$qc_tot[1:5] <- 0 # positions 1-5 have qc_P == NA; treat as passed
HOBO_wip <- HOBO_wip %>%
mutate(qc_all = case_when(qc_tot == 0 ~ 0,
qc_tot != 0 ~ 1)) # 1 = failed at least one check
# as percentage
qc_result <- table(HOBO_wip$qc_all)
qc_result
0 1
5269 203
num(((qc_result[2]/length(HOBO_wip$qc_all))*100),digits=4)
<pillar_num:.4![1]>
1
3.7098
# TODO
# Present a table or graph to show how many data points fail during the four specific QCPs. Discuss shortly
# the reasons for failure and compare the different QCPs against each other.
# Build qc_df: all raw data plus the QC outcomes, with an hour counter.
# as.tibble() is deprecated in the tibble package; use as_tibble().
qc_df <- as_tibble(HOBO_wip)
# Number all hours: 912 hours x 6 ten-minute steps each = 5472 rows.
hour_ct <- rep(seq_len(912), each = 6)
# Attach the hour counter (cbind on a tibble returns a plain data.frame,
# which the later dplyr verbs handle fine).
qc_df <- cbind(qc_df, h_ct = hour_ct)
# Leftover draft of a flag-naming step, kept for reference:
# qc_df <- qc_df %>%
# mutate(flag_)
table(qc_df$qc_tot)
0 1
5269 203
# I have 203 cases but each one has failed only one of the quality checks
# Aggregate flags per hour; hours with >= 2 flagged points count as bad.
test <- qc_df %>%
group_by(., h_flag = h_ct) %>%
summarize(., sum_flags=sum(qc_all)) %>%
filter(sum_flags>=2)
# this yields a vector of all hours that must be set to NA
bad_hours <- test$h_flag
qc_df2 <- qc_df
# Blank out (NA) every temperature value that falls into a bad hour.
bad_temps <- which(qc_df2$h_ct %in% bad_hours)
qc_df2$temp[bad_temps] <- NA
Share of NAs
# Calculate the share of NAs in the hourly time-series in % and write it
# into the meta table (sheet: "Quality ..." -- name truncated in the notes).
hrly_dat <- qc_df2 %>%
group_by(., h_ct) %>%
summarise(., mean_temp=mean(temp))
# this yields hourly means with NAs (mean() returns NA if any value is NA)
# share of NA in hourly time series
table(is.na(hrly_dat$mean_temp))
FALSE TRUE
862 50
# 50 NAs
num((50/length(hrly_dat$mean_temp))*100,digits = 4)
<pillar_num:.4![1]>
[1] 5.4825
# code for summarizing here
Task: Present a table or graph to show how many data points fail during the four specific QCPs. Discuss shortly the reasons for failure and compare the different QCPs against each other.
Answer:
Task: At the end of the code section above you
should generate one! tibble or data.frame named qc_df with
all time information, all data points (temperature and lux) and your
outcomes of the different QCPs.
Answer:
# code for aggregation here
Task: At the end of the code section above you
should generate one! tibble or data.frame named hobo_hourly
with averaged temperature values per hour or NA values (if the hour is
flagged as bad data). See exercise description for more details.
# Build the hourly export table: hourly means (th), timestamps and an
# origin flag, then write it to disk.
dttm <- seq(start_time, end_time, by = "hours") # one stamp per hour (912)
hobo_hourly <- hrly_dat %>%
  select(th = mean_temp) %>%
  mutate(date_time = dttm) %>%
  mutate(origin = "H") # "H" = measured by the HOBO itself; rep() was redundant
hobo_hourly$date_time <- format(hobo_hourly$date_time, "%Y-%m-%d %H:%M:%S")
hobo_hourly$th <- format(hobo_hourly$th, digits = 3, nsmall = 3)
write_csv(hobo_hourly, file = "10350049_Th.csv")
# Fill up all the NA in hobo_hourly (chapter 3.3) based on a regression
# model between the HOBO and a reference station.
# WBI Station
# data import from local disk
WBI_raw <- read.csv("Stunde_096.csv", sep=";")
# convert the date column to a proper Date object
WBI_raw$Tag <- dmy(WBI_raw$Tag)
class(WBI_raw$Tag)
[1] "Date"
# works semi well with this timestamp...
WBI_raw$Stunde <- hm(WBI_raw$Stunde)
class(WBI_raw$Stunde)
[1] "Period"
attr(,"package")
[1] "lubridate"
# The Date + Period pair is awkward, so build a clean hourly POSIXct axis.
# declare time frame
start_time <- ymd_hms("2022-12-01 00:00:00")
end_time <- ymd_hms("2023-01-07 23:50:00")
time_range <- interval(start_time, end_time)
# make my own dttm and scrap the ones that came with the WBI data
dttm <- seq(start_time, end_time, by= "hours")
# clip WBI data, add dttm and rename cols
Wdat_WBI <- WBI_raw %>%
filter(.,between(Tag, date(start_time), date(end_time))) %>%
select(.,temp=AVG_TA200) %>%
mutate(dttm=dttm)
# TODO format the timestamp to get rid of seconds eventually (if the assignment says so)
# FREIBURG Garten station
GAR_raw <- read.csv("Freiburg_Garten_2022-11-30_2023-01-08.csv")
# insert the missing hour (2022-12-16 10:00 UTC) so the row counts line up
GAR_raw <- GAR_raw %>%
add_row(UTC = "2022-12-16 10:00:00", Lokalzeit = "2022-12-16 11:00:00", Lufttemperatur...C. = NA, .before = 395)
GAR_raw[390:396,]
# that's better, now the whole converting and clipping procedure again
# Earlier attempt with explicit time zones, kept for reference:
# GAR_raw$Lokalzeit <- ymd_hms(GAR_raw$Lokalzeit, tz="Europe/Berlin")
# GAR_raw$UTC <- ymd_hms(GAR_raw$UTC, tz="UTC")
# class(GAR_raw$Lokalzeit)
# class(GAR_raw$UTC)
# convert both timestamp columns to POSIXct
GAR_raw$Lokalzeit <- ymd_hms(GAR_raw$Lokalzeit)
GAR_raw$UTC <- ymd_hms(GAR_raw$UTC)
class(GAR_raw$UTC)
[1] "POSIXct" "POSIXt"
Wdat_GAR <- GAR_raw %>%
filter(., between(Lokalzeit, start_time, end_time))
# this took way too long...
# Freiburg Stadtklimastation (urban climate station)
URB_raw <- read.csv("produkt_air_temperature_13667_akt.txt", sep=";")
# parse the integer timestamp (yyyymmddhh) to POSIXct
URB_raw$MESS_DATUM <- ymd_h(URB_raw$MESS_DATUM)
# clip to the study period and keep only timestamp + air temperature
Wdat_URB <- URB_raw %>%
filter(.,between(MESS_DATUM, start_time, end_time)) %>%
select(MESS_DATUM, LUFTTEMPERATUR) %>%
mutate(dttm=dttm)
# phew..correct nr of observations this time..can deselect orig. timestamp later
# TODO
# DWD Station 1443
DWD_raw <- read.csv("produkt_tu_stunde_20210718_20230118_01443.txt", sep=";")
# POSIXct conversion of the yyyymmddhh timestamp
DWD_raw$MESS_DATUM <- ymd_h(DWD_raw$MESS_DATUM)
# clip to the study period, keep timestamp + temperature, add the clean axis
Wdat_DWD <- DWD_raw %>%
filter(., between(MESS_DATUM, start_time, end_time)) %>%
select(., MESS_DATUM, TT_TU) %>%
mutate(dttm=dttm)
# check datetimes (manual cross-check of all four stations, kept for reference)
#
# test <- Wdat_DWD %>%
# select(MESS_DATUM, dttm) %>%
# mutate(Gartime=Wdat_GAR$Lokalzeit) %>%
# mutate(Urbtime=Wdat_URB$MESS_DATUM) %>%
# mutate(wbitime=Wdat_WBI)
# jawoll this worked...
# Stations data frame: one temperature column per reference station,
# aligned on the common hourly axis.
tempDF <- Wdat_DWD %>%
select(., time=dttm, tempDWD=TT_TU) %>%
mutate(tempURB=Wdat_URB$LUFTTEMPERATUR) %>%
mutate(tempGar=Wdat_GAR$Lufttemperatur...C.) %>%
mutate(tempWBI=Wdat_WBI$temp)
# tempWBI still has the comma decimals; re-parse with dec = ","
tempDF$tempWBI <- scan(text=tempDF$tempWBI, dec=",", sep=".")
Read 912 items
# my hobo df
# getwd()
myHOBO <- read.csv("10350049_Th.csv" )
# NOTE(review): this mutate adds a column literally named "myHOBO$th";
# it duplicates the cbind below and could be dropped.
tempDF <- tempDF %>%
mutate(myHOBO$th)
# # tempDF <- as_tibble(tempDF)
# tempDF$tempWBI <- as.numeric(tempDF$tempWBI)
tempDF <- cbind(tempDF,HOBOtemp=myHOBO$th)
# th was written via format(), so it is character; coerce ("NA" -> NA)
tempDF$HOBOtemp <- as.numeric(tempDF$HOBOtemp)
Warning: NAs introduced by coercion
# Manually zoomed-in comparison graph of all stations vs. the HOBO.
date1 <- tempDF$time[210]
date2 <- tempDF$time[245]
ggplot(tempDF,aes(time))+
geom_line(tempDF,mapping=aes(y=tempURB),color="blue")+
geom_line(tempDF,mapping=aes(y=tempGar),color="black")+
geom_line(tempDF,mapping=aes(y=tempWBI),color="green")+
geom_line(tempDF,mapping=aes(y=tempDWD),color="red")+
geom_line(tempDF,mapping=aes(y=HOBOtemp),color="purple")+
ylim(-5,5)+
xlim(c(date1,date2))
# tempWBI is the closest fit; plot it alone against the HOBO series
ggplot(tempDF,aes(time))+
#geom_line(tempDF,mapping=aes(y=tempURB),color="blue")+
#geom_line(tempDF,mapping=aes(y=tempGar),color="black")+
geom_line(tempDF,mapping=aes(y=tempWBI),color="green")+
#geom_line(tempDF,mapping=aes(y=tempDWD),color="red")+
geom_line(tempDF,mapping=aes(y=HOBOtemp),color="purple")
#ylim(-5,5)+
#xlim(c(date1,date2))
# TODO plot legends and axis labels, color palette
# MODEL 1: WBI
# NOTE(review): using tempDF$... inside the formula is an anti-pattern;
# lm(HOBOtemp ~ tempWBI, data = tempDF) would let predict() honour newdata.
# Left unchanged here because downstream predict() relies on this behavior.
modWBI <- lm(tempDF$HOBOtemp~tempDF$tempWBI, tempDF)
summary(modWBI)
Call:
lm(formula = tempDF$HOBOtemp ~ tempDF$tempWBI, data = tempDF)
Residuals:
Min 1Q Median 3Q Max
-3.9860 -0.4251 -0.0575 0.3632 3.9620
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.384386 0.037086 10.37 <2e-16 ***
tempDF$tempWBI 0.910673 0.004279 212.83 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8583 on 860 degrees of freedom
(50 observations deleted due to missingness)
Multiple R-squared: 0.9814, Adjusted R-squared: 0.9813
F-statistic: 4.53e+04 on 1 and 860 DF, p-value: < 2.2e-16
# MODEL 2: DWD (same $-in-formula caveat as MODEL 1)
modDWD <- lm(tempDF$HOBOtemp~tempDF$tempDWD, tempDF)
summary(modDWD)
Call:
lm(formula = tempDF$HOBOtemp ~ tempDF$tempDWD, data = tempDF)
Residuals:
Min 1Q Median 3Q Max
-13.734 -5.084 -0.130 5.535 12.591
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.9510105 0.2121931 23.333 < 2e-16 ***
tempDF$tempDWD -0.0069749 0.0009853 -7.079 3.02e-12 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.113 on 860 degrees of freedom
(50 observations deleted due to missingness)
Multiple R-squared: 0.05506, Adjusted R-squared: 0.05396
F-statistic: 50.11 on 1 and 860 DF, p-value: 3.021e-12
# MODEL 3: URB (same $-in-formula caveat as MODEL 1)
modURB <- lm(tempDF$HOBOtemp~tempDF$tempURB, tempDF)
summary(modURB)
Call:
lm(formula = tempDF$HOBOtemp ~ tempDF$tempURB, data = tempDF)
Residuals:
Min 1Q Median 3Q Max
-3.5554 -0.4248 0.0194 0.3621 3.9142
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.340480 0.037668 -9.039 <2e-16 ***
tempDF$tempURB 0.943340 0.004249 221.999 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8235 on 860 degrees of freedom
(50 observations deleted due to missingness)
Multiple R-squared: 0.9828, Adjusted R-squared: 0.9828
F-statistic: 4.928e+04 on 1 and 860 DF, p-value: < 2.2e-16
# MODEL 4: GAR (same $-in-formula caveat as MODEL 1)
modGAR <- lm(tempDF$HOBOtemp~tempDF$tempGar, tempDF)
summary(modGAR)
Call:
lm(formula = tempDF$HOBOtemp ~ tempDF$tempGar, data = tempDF)
Residuals:
Min 1Q Median 3Q Max
-3.4704 -0.4985 -0.0718 0.3866 6.0194
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.164629 0.051511 -3.196 0.00144 **
tempDF$tempGar 0.933225 0.005853 159.446 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.137 on 859 degrees of freedom
(51 observations deleted due to missingness)
Multiple R-squared: 0.9673, Adjusted R-squared: 0.9673
F-statistic: 2.542e+04 on 1 and 859 DF, p-value: < 2.2e-16
# table of model coefficients & R^2
Intercepts <- c(summary(modDWD)$coefficients[1,4],
summary(modURB)$coefficients[1,4],
summary(modGAR)$coefficients[1,4],
summary(modWBI)$coefficients[1,4])
R_squ <- c(summary(modDWD)$r.squared,
summary(modURB)$r.squared,
summary(modGAR)$r.squared,
summary(modWBI)$r.squared)
stations <- c("DWD","URB","GAR","WBI")
MOD_RES <- arrange(tibble(stations,Intercepts,R_squ))
# Re-inspect the best model before using it to fill the gaps.
summary(modWBI)
Call:
lm(formula = tempDF$HOBOtemp ~ tempDF$tempWBI, data = tempDF)
Residuals:
Min 1Q Median 3Q Max
-3.9860 -0.4251 -0.0575 0.3632 3.9620
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.384386 0.037086 10.37 <2e-16 ***
tempDF$tempWBI 0.910673 0.004279 212.83 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8583 on 860 degrees of freedom
(50 observations deleted due to missingness)
Multiple R-squared: 0.9814, Adjusted R-squared: 0.9813
F-statistic: 4.53e+04 on 1 and 860 DF, p-value: < 2.2e-16
# the WBI model sports the best fit according to R^2,
# corroborating what the comparison plot showed
# fill in NA using regression coefficients from the WBI model
predDF <- tibble(HOBOtemp=tempDF$HOBOtemp,tempWBI=tempDF$tempWBI)
# NOTE(review): because modWBI's formula uses tempDF$ terms, predict()
# does not use newdata as intended -- the terms resolve to the full global
# vectors. Confirm that preds aligns row-by-row with tempDF before
# trusting the NA fill below.
preds <- predict(modWBI, tempDF,type="response")
comparison <- tibble(HOBOorig=tempDF$HOBOtemp,
tempWBI=tempDF$tempWBI,
predicted=preds)
head(comparison)
# make corrected HOBO vector
# create result df
# fill in NAs with model predictions; keep measured values where present
tempDF <- tempDF %>%
mutate(HOBOcorr=case_when(is.na(HOBOtemp)~preds,is.numeric(HOBOtemp)~HOBOtemp))
hobo_hr_corr <- tempDF %>%
select(dttm=time, th=HOBOcorr) %>%
mutate(origin="H")
# mark rows that were filled from the regression with origin "R"
hobo_hr_corr$origin[which(is.na(tempDF$HOBOtemp))] <- "R"
hobo_hr_corr$dttm <- format(hobo_hr_corr$dttm, "%Y-%m-%d %H:%M:%S")
hobo_hr_corr$th <- format(hobo_hr_corr$th, digits=3, nsmall=3)
write_csv(hobo_hr_corr, file = "10350049_Th.csv" )
# NOTE(review): this is a stray duplicate of the earlier rollapply() call;
# qc2 is recomputed unchanged and the line could be removed.
qc2 <- rollapply(HOBO_wip$delta, width=6,FUN=flagP) # width parameter is window
Calculate temperature indices for only one timeseries. You can pick any HOBO you like. Needed indices: * Calculate the mean temperature * Calculate the mean night temperature * Calculate the daytime coefficient of variation
Calculate temperature indices for all HOBOs of this term: The indices necessary are:
Now apply the same query used in the last task again, but for the other kind of data.
In R, calculate either of the following indices and finally merge the two tables: * Mean time lag between maximum light intensity and maximum temperature * Mean of the maximum daily temperature change (per hour)
First, download the data you want to use:
Finally, with the index in place, download the indices overview as created in the last two tasks and combine them in R into one overview table:
The data comes from OpenStreetMap, the largest community driven effort to build a database of global structural geodata, which can be used by everyone under a ODbL license, which includes commercial uses. Geodata standards, like the OSM model but also European standards like INSPIRE are extremely important to make public geodata usable. These datasets are heavily structured and their use often involves many dimensions and large data amounts. Without a standard, each authority would publish different data and a collective use is in fact impossible.
Filter the database for only the city districts, that contain a reference station and present them in either in a human readable table or a map.
First, create a sub-query/view/WITH statement containing all reference stations. Then extend this with the needed filter.
Query all city districts of Freiburg that contain at least three HOBOs and aggregate the indices calculated in the "Create indices" table/view/query for each of these districts; present them as a table. Repeat the procedure for the HOBO locations of WT21 or WT22 (or both). Are there differences? Describe.
Use the queries constructed in the last task to create a map of Freiburg, that illustrates differences in one (or more) of the temperature indices between the city districts of Freiburg. You can create this map either in R or in QGis.
Get the data here:
Either include the map or make the visualization here: